{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bdf9f60",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import geopandas as gpd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import folium\n",
    "import folium.plugins as fp\n",
    "from openai import OpenAI\n",
    "\n",
    "# Do not hardcode the API key in the notebook (it would be committed and shared with the file).\n",
    "# Called without api_key, the client reads the key from the OPENAI_API_KEY environment variable.\n",
    "client = OpenAI()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "48417b70",
   "metadata": {},
   "source": [
    "### 1. General BUA Preparation + Initial Open Data Enrichment (BUA ONS Data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0bbf3d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# BUA boundaries: load the GB layer, keep the BUAs whose code does not start with 'S', and measure their area\n",
    "bua_gdf = gpd.read_file(\"Data/ONS_BUA_2022_GB_BGG.gpkg\").to_crs(\"EPSG:4326\")\n",
    "bua_gdf_ew = bua_gdf[bua_gdf['BUA22CD'].str[0] != \"S\"].reset_index(drop=True).drop(columns=['BUA22NMW', 'BUA22NMG', 'BNG_E', 'BNG_N'])\n",
    "bua_gdf_ew['area_ha_2dp'] = round((bua_gdf_ew.to_crs('EPSG:27700')['geometry'].area / 10000), 2) # area taken in EPSG:27700, / 10000 to get to ha from m2\n",
    "\n",
    "# ONS BUA characteristics data from 2021: the workbook is opened once and every sheet is parsed from it,\n",
    "# rather than re-opening the .xlsx file for each of the 20 sheets\n",
    "ons_characteristics_data = {}\n",
    "with pd.ExcelFile(\"Data/townsandcitiescharacteristicsofbuiltupareasenglandandwalescensus2021.xlsx\") as characteristics_workbook:\n",
    "    for i in range(1,11): # load in and concatenate the 'c' and 'd' sheets of each table (data for England and Wales)\n",
    "        ons_characteristics_data['characteristic_data_' + str(i)] = pd.concat(\n",
    "            [pd.read_excel(characteristics_workbook, sheet_name=(str(i) + sheet_suffix), skiprows=2) for sheet_suffix in (\"c\", \"d\")],\n",
    "            ignore_index=True)\n",
    "\n",
    "# Attach characteristics table 1 to the boundaries (inner join: BUAs missing from the table are dropped), then rename the columns positionally\n",
    "bua_gdf_ew_pop = bua_gdf_ew.merge(ons_characteristics_data['characteristic_data_1'], how='inner', left_on='BUA22CD', right_on='BUA code').drop(columns=['Country code', 'Region code', 'BUA22CD', 'BUA22NM', 'LONG', 'LAT'])\n",
    "bua_gdf_ew_pop.columns = ['geometry', 'Area in ha (2dp)', 'Country', 'Region', 'BUA Code', 'BUA Name', 'BUA Size Classification', '2021 Resident Population']\n",
    "\n",
    "# Amenity tables: columns are picked by position from each 'Table 2' sheet, then outer-joined on 'BUA code'\n",
    "pubs_bua_data = pd.read_excel(\"Data/accesstoamenitiespubs.xlsx\", sheet_name=\"Table 2\", skiprows=5).iloc[:,[0,1,2,3]]\n",
    "worship_bua_data = pd.read_excel(\"Data/accesstoamenitiesreligiousworship.xlsx\", sheet_name=\"Table 2\", skiprows=5).iloc[:,[0,1,2,16]]\n",
    "communities_bua_data = pd.read_excel(\"Data/accesstoamenitiescommunityfacilities.xlsx\", sheet_name=\"Table 2\", skiprows=5).iloc[:,[0,1,2,8]]\n",
    "bua_other_characteristics_df = pubs_bua_data.merge(worship_bua_data, how='outer', on='BUA code').merge(communities_bua_data, how='outer', on='BUA code').iloc[:,[0,1,2,3,5,6,8,9]]\n",
    "bua_other_characteristics_df.columns = ['BUA Code', 'BUA Name', 'Pub Count', 'Pubs per 100,000 people', 'Religious Worship Sites Count', 'Religious Worship Sites per 100,000 people', 'Community Facility Count', 'Community Facilities per 100,000 people']\n",
    "\n",
    "# Functional greenspace table, with the amenity columns attached to it\n",
    "greenspace_bua_data = pd.read_excel(\"Data/buafunctionalgreenspacearea.xlsx\", sheet_name=\"Table 2\", skiprows=4).iloc[:,[0,1,2,3,4]]\n",
    "greenspace_bua_data.columns = ['BUA Code', 'BUA Name', 'Area (m2)', 'Functional greenspace area (m2)', 'Functional greenspace area percentage (%)']\n",
    "# NOTE(review): this join is keyed on 'BUA Name' only (the amenities 'BUA Code' column is dropped by .iloc[:,1:]) - confirm BUA names are unique\n",
    "bua_other_characteristics_df = greenspace_bua_data.merge(bua_other_characteristics_df.iloc[:,1:], how='left', on='BUA Name')\n",
    "bua_gdf_ew_some_data = bua_gdf_ew_pop.merge(bua_other_characteristics_df, how='left', on=['BUA Code', 'BUA Name']) # will give other characteristics for Medium or larger BUA size classification BUAs\n",
    "\n",
    "# Description word limit grows with the BUA area percentile: 20 words plus up to 140 more\n",
    "bua_gdf_ew_some_data['Area in ha percentile (%)'] = bua_gdf_ew_some_data['Area in ha (2dp)'].rank(pct=True) * 100.00\n",
    "bua_gdf_ew_some_data['word_limit'] = (20 + round(((bua_gdf_ew_some_data['Area in ha percentile (%)']/100.00) * 140),0))\n",
    "\n",
    "# County lookup: each BUA gets the upper-tier local authority that covers its LONG/LAT point (inner joins drop BUAs whose point is not covered)\n",
    "counties_gdf = gpd.read_file(\"Data/Upper_Tier_Local_Authorities_December_2022_Boundaries_UK_BFC.gpkg\").to_crs(\"EPSG:4326\") # Source: https://geoportal.statistics.gov.uk/datasets/f6c95adbbc2949b5a63fd97833562d2e_0/explore\n",
    "centroid_data = bua_gdf[['BUA22CD', 'LONG', 'LAT']] # reuse the layer loaded above instead of reading and reprojecting the GeoPackage a second time\n",
    "centroids = gpd.GeoDataFrame(data=centroid_data, geometry=gpd.points_from_xy(centroid_data['LONG'], centroid_data['LAT']), crs=\"EPSG:4326\")\n",
    "bua_counties_df = centroids.sjoin(counties_gdf[['UTLA22NM','geometry']], how='inner', predicate='covered_by')[['BUA22CD','UTLA22NM']]\n",
    "bua_counties_df.columns = ['BUA Code', 'County Name']\n",
    "bua_gdf_ew_some_data_joined = bua_gdf_ew_some_data.merge(bua_counties_df, how='inner', on='BUA Code')\n",
    "bua_gdf_ew_some_data_joined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21e61fb4",
   "metadata": {},
   "outputs": [],
   "source": [
    "bua_2022_gdf = bua_gdf_ew_some_data_joined.copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b64e3696",
   "metadata": {},
   "source": [
    "### 2. Enriching with osmlanduse Land Use Land Cover Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5da7a976",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # PRE-PROCESSING AND FILTERING OSMLANDUSE DATASET FOR GREAT BRITAIN\n",
    "\n",
    "# import rasterio, rasterio.features\n",
    "# iterations_number = 6\n",
    "\n",
    "# def chop_and_save_file(iteration, save=True):\n",
    "#     global uk_lulc_data, data, transform, profile\n",
    "#     with rasterio.open(\"Data/LULC_DATA/UK_10m_3035_tiled.tif\") as uk_lulc_data: # \"Data/LULC_DATA/UK_10m_3035_tiled.tif\"\n",
    "#         row_interval = uk_lulc_data.shape[0] / iterations_number\n",
    "#         row_start, row_stop = (iteration-1)*row_interval, (iteration)*row_interval\n",
    "#         col_start, col_stop = 0, uk_lulc_data.shape[1]\n",
    "#         win = rasterio.windows.Window(col_start, row_start, (col_stop-col_start), (row_stop-row_start))\n",
    "#         data = uk_lulc_data.read(window=win) # bands=1\n",
    "#         transform = uk_lulc_data.window_transform(win)\n",
    "#         profile=uk_lulc_data.profile.copy()\n",
    "#         profile.update({\"height\": win.height, \"width\": win.width, \"transform\": transform})\n",
    "#     if save:\n",
    "#         with rasterio.open(f\"cropped_by_pixel_lulc_data_{iteration}.tif\", \"w\", **profile) as dst:\n",
    "#             dst.write(data)\n",
    "#     return None\n",
    "\n",
    "# def read_in_tif_as_gdf(iteration=1):\n",
    "#     with rasterio.open(f\"cropped_by_pixel_lulc_data_{iteration}.tif\") as uk_lulc_data_part: # \"Data/LULC_DATA/UK_10m_3035_tiled.tif\"\n",
    "#         array = uk_lulc_data_part.read(1)\n",
    "#         mask  = array != uk_lulc_data_part.nodata\n",
    "#         # Generate GeoJSON‐like dicts for each contiguous zone, then load into a geopandas geodataframe\n",
    "#         results = ({\"properties\": {\"value\": v}, \"geometry\": geom} for geom, v\n",
    "#             in rasterio.features.shapes(array, mask=mask, transform=uk_lulc_data_part.transform))\n",
    "#         gdf = gpd.GeoDataFrame.from_features(results, crs=uk_lulc_data_part.crs)\n",
    "#         gdf.columns = ['geometry', 'lulc_categorisation_number']\n",
    "#         gdf = gdf[gdf['lulc_categorisation_number'] != 0.00].reset_index(drop=True)\n",
    "#         return gdf\n",
    "\n",
    "# import time\n",
    "# from contextlib import contextmanager\n",
    "# @contextmanager\n",
    "# def timer(description=\"Operation\", logger=None):\n",
    "#     start_time = time.perf_counter()\n",
    "#     yield #runs with block\n",
    "#     end_time = time.perf_counter()\n",
    "#     elapsed_time = end_time - start_time\n",
    "#     if logger is not None:\n",
    "#         logger.info(f\"{description}: {elapsed_time:.4f} seconds\")\n",
    "#     else:\n",
    "#         print(f\"{description}: {elapsed_time:.4f} seconds\")\n",
    "\n",
    "# import dask_geopandas as dgpd\n",
    "\n",
    "# if __name__ == \"__main__\":\n",
    "    \n",
    "#     with timer(f\"BUA Reading/Clipping Processing\"): # Timer 0\n",
    "#         bua_2022_gdf_for_processing = gpd.read_file(\"BUA_2022_GB.gpkg\").to_crs('EPSG:4326')[['geometry']]\n",
    "#         bua_2022_gdf_clip_mask_dissolved = gpd.read_file(\"BUA_2022_GB.gpkg\").to_crs('EPSG:4326').dissolve()[['geometry']]\n",
    "    \n",
    "#     for iteration_i in range(1,(iterations_number+1)):\n",
    "        \n",
    "#         with timer(f\"TIF formatting iteration {iteration_i}\"): # Timer 1\n",
    "#             chop_and_save_file(iteration_i, save=True)\n",
    "#             uk_gdf = read_in_tif_as_gdf(iteration=iteration_i).to_crs('EPSG:4326')\n",
    "#             print(uk_gdf.info()) # before characteristics visualisation\n",
    "        \n",
    "#         with timer(f\"Geospatial intersecting iteration {iteration_i}\"): # Timer 2\n",
    "#             # INTERSECTING\n",
    "#             uk_gdf_dist_left = dgpd.from_geopandas(uk_gdf, npartitions=12)\n",
    "#             bua_gdf_dist_right = dgpd.from_geopandas(bua_2022_gdf_for_processing, npartitions=1)\n",
    "#             dist_lazy_joined = dgpd.sjoin(uk_gdf_dist_left, bua_gdf_dist_right, how=\"inner\", predicate=\"intersects\").reset_index(drop=True)\n",
    "#             uk_gdf_intersections = dist_lazy_joined.compute()\n",
    "#             print(uk_gdf_intersections.info()) # after distributed sjoin characteristics visualisation\n",
    "            \n",
    "#         with timer(f\"Geospatial clipping iteration {iteration_i}\"): # Timer 3\n",
    "#             # CLIPPING (LESS EFFICIENT, SO SUBSET TO INTERSECTS FIRST)\n",
    "#             uk_gdf_intersections_dist_left = dgpd.from_geopandas(uk_gdf_intersections, npartitions=12)\n",
    "#             uk_gdf_clipped_lazy = dgpd.clip(uk_gdf_intersections_dist_left, bua_2022_gdf_clip_mask_dissolved, keep_geom_type=False)\n",
    "#             uk_gdf_clipped = uk_gdf_clipped_lazy.compute()\n",
    "#             print(uk_gdf_clipped.info()) # after distributed clip characteristics visualisation\n",
    "            \n",
    "#         uk_gdf_clipped.to_file(f\"lulc_clipped_{iteration_i}.gpkg\", driver=\"GPKG\", layer=f\"land_use_land_cover_clipped_{iteration_i}\") # saving\n",
    "\n",
    "# overall_lulc_gb_data_gdf = gpd.GeoDataFrame(data={'geometry': []}, crs='EPSG:4326')\n",
    "# for iteration_i in range(1, (iterations_number+1)):\n",
    "#     read_in_gdf = gpd.read_file(f\"lulc_clipped_{iteration_i}.gpkg\")\n",
    "#     overall_lulc_gb_data_gdf = pd.concat([overall_lulc_gb_data_gdf, read_in_gdf], ignore_index=True)\n",
    "\n",
    "# classification_data_list = [[5.0, \"Water Bodies\", \"#00ccf2\"], \n",
    "#                            [11.0, \"Urban Fabric\", \"#e6004d\"], \n",
    "#                            [12.0, \"Industrial, Commercial and Transport Units\", \"#cc4df2\"], \n",
    "#                            [13.0, \"Mine, Dump and Construction Sites\", \"#a600cc\"], \n",
    "#                            [14.0, \"Artificial, Non-Agricultural Vegetated Areas\", \"#ffa6ff\"], \n",
    "#                            [21.0, \"Arable Land\", \"#ffffa8\"], \n",
    "#                            [22.0, \"Permanent Crops\", \"#e68000\"], \n",
    "#                            [23.0, \"Pastures\", \"#e6e64d\"], \n",
    "#                            [31.0, \"Forests\", \"#4dff00\"], \n",
    "#                            [32.0, \"Shrub and/or Herbaceous Vegetation Associations\", \"#ccf24d\"], \n",
    "#                            [33.0, \"Open Spaces with Little or No Vegetation\", \"#e6e6e6\"], \n",
    "#                            [41.0, \"Inland Wetlands\", \"#a6a6ff\"], \n",
    "#                            [42.0, \"Coastal Wetlands\", \"#e6e6ff\"]]\n",
    "# classification_df = pd.DataFrame(classification_data_list, columns=['lulc_categorisation_number', 'lulc_categorisation', 'hex_color'])\n",
    "# gb_gdf = overall_lulc_gb_data_gdf.merge(classification_df, how='left', on='lulc_categorisation_number')\n",
    "# gb_gdf.to_file(\"gb_lulc_data_buas_2022_undissolved.gpkg\", driver=\"GPKG\", layer=\"land_use_land_cover\")\n",
    "# print(gb_gdf.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a988d34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # CALCULATE LAND USE PROPORTIONS PER BUA GDF ENTRY\n",
    "# gb_gdf = gpd.read_file(\"gb_lulc_data_buas_2022_undissolved.gpkg\")[['geometry','hex_color','lulc_categorisation']].drop_duplicates()\n",
    "\n",
    "# land_use_dict_list = []\n",
    "# for bua_i in range(0,len(bua_2022_gdf)):\n",
    "#     print(f\"NOW PROCESSING BUA {bua_i}\") # Progress visualisation\n",
    "#     bua_i_entry = bua_2022_gdf.iloc[bua_i:bua_i+1,:]\n",
    "#     combined_areas = gb_gdf.copy().sjoin(bua_i_entry, how='inner', predicate='intersects') # Predicates in geopandas sjoin(): 'dwithin', 'covers', 'contains_properly', 'contains', 'within', 'intersects', None, 'overlaps', 'touches', 'covered_by', 'crosses'\n",
    "#     combined_areas_clipped = combined_areas.clip(bua_i_entry)\n",
    "#     combined_areas_clipped_dissolved = combined_areas_clipped.dissolve(by='lulc_categorisation').reset_index()\n",
    "#     combined_areas_clipped_dissolved['area_value'] = combined_areas_clipped_dissolved.to_crs('EPSG:27700').area\n",
    "#     total_lulc_categorised_area = combined_areas_clipped_dissolved['area_value'].sum()\n",
    "#     combined_areas_clipped_dissolved['area_percentage_2dp'] = round((combined_areas_clipped_dissolved['area_value'] * 100.00 / total_lulc_categorised_area),2)\n",
    "#     land_use_dict_list += [dict(zip(combined_areas_clipped_dissolved['lulc_categorisation'].tolist(), combined_areas_clipped_dissolved['area_percentage_2dp'].tolist()))]\n",
    "\n",
    "# bua_2022_gdf['lulc_proportions_dict_values'] = land_use_dict_list\n",
    "# bua_2022_gdf.to_file(\"bua_2022_gdf_pre_processed_lulc_data_no_duplicates.gpkg\", driver=\"GPKG\", layer=\"bua_layer_1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce446bdb",
   "metadata": {},
   "outputs": [],
   "source": [
    "bua_2022_gdf = gpd.read_file(\"bua_2022_gdf_pre_processed_lulc_data_no_duplicates.gpkg\")\n",
    "bua_2022_gdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b99d1952",
   "metadata": {},
   "outputs": [],
   "source": [
    "# arable_values = []\n",
    "# pasture_values = []\n",
    "# permanent_crop_values = []\n",
    "# inland_wetland_values = []\n",
    "# coastal_wetland_values = []\n",
    "# forest_values = []\n",
    "# shrub_values = []\n",
    "# water_values = []\n",
    "\n",
    "# open_spaces_little_vegetation_values = []\n",
    "# artificial_green_values = []\n",
    "\n",
    "# industrial_values = []\n",
    "# urban_fabric_values = []\n",
    "# mine_values = []\n",
    "\n",
    "\n",
    "# for i in range(0,len(bua_2022_gdf)):\n",
    "    \n",
    "#     # COUNTRYSIDE\n",
    "#     try:\n",
    "#         arable_values += [bua_2022_gdf['lulc_proportions_dict_values'][i]['Arable Land']]\n",
    "#     except:\n",
    "#         arable_values += [0.00]\n",
    "#     try:\n",
    "#         pasture_values += [bua_2022_gdf['lulc_proportions_dict_values'][i]['Pastures']]\n",
    "#     except:\n",
    "#         pasture_values += [0.00]\n",
    "#     try:\n",
    "#         permanent_crop_values += [bua_2022_gdf['lulc_proportions_dict_values'][i]['Permanent Crops']]\n",
    "#     except:\n",
    "#         permanent_crop_values += [0.00]\n",
    "#     try:\n",
    "#         inland_wetland_values += [bua_2022_gdf['lulc_proportions_dict_values'][i]['Inland Wetlands']]\n",
    "#     except:\n",
    "#         inland_wetland_values += [0.00]\n",
    "#     try:\n",
    "#         coastal_wetland_values += [bua_2022_gdf['lulc_proportions_dict_values'][i]['Coastal Wetlands']]\n",
    "#     except:\n",
    "#         coastal_wetland_values += [0.00]\n",
    "#     try:\n",
    "#         forest_values += [bua_2022_gdf['lulc_proportions_dict_values'][i]['Forests']]\n",
    "#     except:\n",
    "#         forest_values += [0.00]\n",
    "#     try:\n",
    "#         shrub_values += [bua_2022_gdf['lulc_proportions_dict_values'][i]['Shrub and/or Herbaceous Vegetation Associations']]\n",
    "#     except:\n",
    "#         shrub_values += [0.00]    \n",
    "#     try:\n",
    "#         water_values += [bua_2022_gdf['lulc_proportions_dict_values'][i]['Water Bodies']]\n",
    "#     except:\n",
    "#         water_values += [0.00]\n",
    "\n",
    "#     # SEMI-URBAN\n",
    "#     try:\n",
    "#         open_spaces_little_vegetation_values += [bua_2022_gdf['lulc_proportions_dict_values'][i]['Open Spaces with Little or No Vegetation']]\n",
    "#     except:\n",
    "#         open_spaces_little_vegetation_values += [0.00]    \n",
    "#     try:\n",
    "#         artificial_green_values += [bua_2022_gdf['lulc_proportions_dict_values'][i]['Artificial, Non-Agricultural Vegetated Areas']]\n",
    "#     except:\n",
    "#         artificial_green_values += [0.00]\n",
    "    \n",
    "#     # URBAN\n",
    "#     try:\n",
    "#         industrial_values += [bua_2022_gdf['lulc_proportions_dict_values'][i]['Industrial, Commercial and Transport Units']]\n",
    "#     except:\n",
    "#         industrial_values += [0.00]\n",
    "#     try:\n",
    "#         urban_fabric_values += [bua_2022_gdf['lulc_proportions_dict_values'][i]['Urban Fabric']]\n",
    "#     except:\n",
    "#         urban_fabric_values += [0.00]\n",
    "#     try:\n",
    "#         mine_values += [bua_2022_gdf['lulc_proportions_dict_values'][i]['Mine, Dump and Construction Sites']]\n",
    "#     except:\n",
    "#         mine_values += [0.00]\n",
    "\n",
    "# bua_2022_gdf['arable_land_use_percent'] = arable_values\n",
    "# bua_2022_gdf['pasture_land_use_percent'] = pasture_values\n",
    "# bua_2022_gdf['permanent_crops_land_use_percent'] = permanent_crop_values\n",
    "# bua_2022_gdf['inland_wetland_land_use_percent'] = inland_wetland_values\n",
    "# bua_2022_gdf['coastal_wetland_land_use_percent'] = coastal_wetland_values\n",
    "# bua_2022_gdf['forest_land_use_percent'] = forest_values\n",
    "# bua_2022_gdf['shrub_land_use_percent'] = shrub_values\n",
    "# bua_2022_gdf['water_land_use_percent'] = water_values\n",
    "\n",
    "# bua_2022_gdf['open_spaces_no_veg_land_use_percent'] = open_spaces_little_vegetation_values\n",
    "# bua_2022_gdf['artificial_veg_land_use_percent'] = artificial_green_values\n",
    "\n",
    "# bua_2022_gdf['industrial_land_use_percent'] = industrial_values\n",
    "# bua_2022_gdf['urban_fabric_land_use_percent'] = urban_fabric_values\n",
    "# bua_2022_gdf['mine_land_use_percent'] = mine_values\n",
    "\n",
    "# bua_2022_gdf['countryside_land_use_percent'] = round((bua_2022_gdf['arable_land_use_percent'] + bua_2022_gdf['pasture_land_use_percent'] + bua_2022_gdf['permanent_crops_land_use_percent'] + bua_2022_gdf['inland_wetland_land_use_percent'] + bua_2022_gdf['coastal_wetland_land_use_percent'] + bua_2022_gdf['forest_land_use_percent'] + bua_2022_gdf['shrub_land_use_percent'] + bua_2022_gdf['water_land_use_percent']),2)\n",
    "# bua_2022_gdf['semi_urban_land_use_percent'] = bua_2022_gdf['open_spaces_no_veg_land_use_percent'] + bua_2022_gdf['artificial_veg_land_use_percent']\n",
    "# bua_2022_gdf['urban_land_use_percent'] = bua_2022_gdf['industrial_land_use_percent'] + bua_2022_gdf['urban_fabric_land_use_percent'] + bua_2022_gdf['mine_land_use_percent']\n",
    "# bua_2022_gdf['countryside_land_use_percent'].hist(bins=30, edgecolor='white')\n",
    "# plt.xlim(0,100)\n",
    "# plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ec2f339c",
   "metadata": {},
   "source": [
    "### 3. Enriching with OpenStreetMap Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3237518",
   "metadata": {},
   "outputs": [],
   "source": [
    "import osmnx\n",
    "pd.set_option('display.max_columns', 1000)\n",
    "osmnx.settings.use_cache = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d39076ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "bua_2022_gdf['area_ha_2dp'] = round((bua_2022_gdf.to_crs('EPSG:27700')['geometry'].area / 10000), 2)\n",
    "bua_2022_gdf.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6081603",
   "metadata": {},
   "outputs": [],
   "source": [
    "bua_gdf = gpd.read_file(\"Data/ONS_BUA_2022_GB_BGG.gpkg\").to_crs(\"EPSG:4326\")\n",
    "bua_gdf_ew = bua_gdf[bua_gdf['BUA22CD'].str[0] != \"S\"].reset_index(drop=True).drop(columns=['BUA22NMW', 'BUA22NMG', 'BNG_E', 'BNG_N'])\n",
    "bua_gdf_ew['area_ha_2dp'] = round((bua_gdf_ew.to_crs('EPSG:27700')['geometry'].area / 10000), 2)\n",
    "bua_gdf_ew.info()\n",
    "# bua_gdf_ew.explore()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bfaba9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # OSM POIs for Future Research: Viewpoints, Community Centres, Marketplace, Bus Stops, Train Stations, Places of Worship, Parks, Post Offices, Museums, Tourist Attractions\n",
    "\n",
    "# # OSM BUILDINGS & ROADS METRICS DATA EXTRACTION PER BUA\n",
    "\n",
    "# gdf = bua_2022_gdf.copy()\n",
    "\n",
    "# buildings_res_count_list = []\n",
    "# buildings_res_density_per_ha_list = []\n",
    "# roads_street_density_list = []\n",
    "# roads_intersection_density_list = []\n",
    "# roads_circuity_averages_list = []\n",
    "# roads_dead_ends_list = []\n",
    "\n",
    "# def import_and_process_osm_buildings(bua_to_include_list, building_type=\"residential\"):\n",
    "#     polygon = gdf[gdf['BUA Name'].isin(bua_to_include_list)].reset_index(drop=True).dissolve()['geometry'][0]\n",
    "#     tags_all = {\"building\": True}\n",
    "#     try:\n",
    "#         features_all_buildings = osmnx.features.features_from_polygon(polygon=polygon, tags=tags_all).reset_index()\n",
    "#         tags_accommodation = {\"building\": [\"apartments\", \"barracks\", \"bungalow\", \"cabin\", \"detached\", \"annexe\", \n",
    "#                                         \"dormitory\", \"farm\", \"ger\", \"house\", \"houseboat\", \"residential\", \"semidetached_house\",\n",
    "#                                         \"static caravan\", \"stilt_house\", \"terrace\", \"tree_house\", \"trullo\"]}\n",
    "#         features_residential_buildings = osmnx.features.features_from_polygon(polygon=polygon, tags=tags_accommodation).reset_index()\n",
    "#         residential_building_ids = features_residential_buildings['id'].unique()\n",
    "#         if building_type == \"residential\":\n",
    "#             special_buildings = features_all_buildings[features_all_buildings['id'].isin(residential_building_ids)].reset_index(drop=True)\n",
    "#         else:\n",
    "#             special_buildings = features_all_buildings[~features_all_buildings['id'].isin(residential_building_ids)].reset_index(drop=True)\n",
    "#         special_buildings_to_map = special_buildings[['geometry', 'name']]\n",
    "#         special_buildings_to_map.loc[:,'name'] = np.where(special_buildings_to_map['name'].isna(), \"Unknown\", special_buildings_to_map['name'])\n",
    "#         return special_buildings_to_map\n",
    "#     except:\n",
    "#         return gpd.GeoDataFrame(columns=['geometry','name']) # otherwise return empty gdf\n",
    "\n",
    "# for bua_i in range(0,len(gdf)):\n",
    "#     print(bua_i) # Progress visualisation\n",
    "#     bua_row = gdf.iloc[bua_i:bua_i+1,:].reset_index(drop=True)\n",
    "\n",
    "#     if (((bua_i % 250) == 0) | ((bua_i+1) == len(gdf))): # saved backup of extraction progress in case of unexpected runtime failures etc., including on skipped iterations\n",
    "#         pd.DataFrame(data={'osm_buildings_residential_count': buildings_res_count_list,\n",
    "#                            'osm_buildings_residential_density_per_ha': buildings_res_density_per_ha_list,\n",
    "#                            'osm_roads_street_density_per_km': roads_street_density_list,\n",
    "#                            'osm_roads_intersection_density_per_km': roads_intersection_density_list,\n",
    "#                            'osm_roads_circuity_average': roads_circuity_averages_list,\n",
    "#                            'osm_roads_proportion_dead_ends': roads_dead_ends_list}).to_csv(\"osm_extraction_saved_progress.csv\")\n",
    "        \n",
    "#     # if index of row not in a row of medium, large or major, skip processing the row, and move to next bua_i iteration\n",
    "#     if not bua_i in bua_gdf_ew_some_data_joined[bua_gdf_ew_some_data_joined['BUA Size Classification'].isin(['Medium','Large','Major'])].index.tolist():\n",
    "#         buildings_res_count_list += [np.nan]\n",
    "#         buildings_res_density_per_ha_list += [np.nan]\n",
    "#         roads_street_density_list += [np.nan]\n",
    "#         roads_intersection_density_list += [np.nan]\n",
    "#         roads_circuity_averages_list += [np.nan]\n",
    "#         roads_dead_ends_list += [np.nan]\n",
    "\n",
    "#         if ((bua_i+1) == len(gdf)): # final saving of values etc.\n",
    "#             pd.DataFrame(data={'osm_buildings_residential_count': buildings_res_count_list,\n",
    "#                            'osm_buildings_residential_density_per_ha': buildings_res_density_per_ha_list,\n",
    "#                            'osm_roads_street_density_per_km': roads_street_density_list,\n",
    "#                            'osm_roads_intersection_density_per_km': roads_intersection_density_list,\n",
    "#                            'osm_roads_circuity_average': roads_circuity_averages_list,\n",
    "#                            'osm_roads_proportion_dead_ends': roads_dead_ends_list}).to_csv(\"osm_extraction_saved_progress_final.csv\")\n",
    "\n",
    "#         continue     \n",
    "\n",
    "#     # OSM BUILDINGS DATA EXTRACTION\n",
    "#     buildings_res = len(import_and_process_osm_buildings(bua_row['BUA Name'], building_type='residential'))\n",
    "#     buildings_res_count_list += [buildings_res]\n",
    "#     buildings_res_density_per_ha_list += [round((buildings_res / (bua_row['area_ha_2dp'][0])),2)]\n",
    "\n",
    "#     # OSM ROADS DATA EXTRACTION\n",
    "#     try:\n",
    "#         roads_for_bua = osmnx.graph_from_polygon(polygon=bua_row['geometry'][0], network_type=\"drive\", simplify=True, truncate_by_edge=True)\n",
    "#         osmnx_basic_stats = osmnx.stats.basic_stats(G=roads_for_bua, area=(bua_row['area_ha_2dp'][0] * 10000)) # * 10000 to get to m2 from ha\n",
    "#         roads_street_density_list += [osmnx_basic_stats['street_density_km']]\n",
    "#         roads_intersection_density_list += [osmnx_basic_stats['intersection_density_km']]\n",
    "#         roads_circuity_averages_list += [osmnx_basic_stats['circuity_avg']]\n",
    "#         roads_dead_ends_list += [osmnx_basic_stats['streets_per_node_proportions'][1] * 100.00]\n",
    "\n",
    "#     except:\n",
    "#         roads_street_density_list += [np.nan]\n",
    "#         roads_intersection_density_list += [np.nan]\n",
    "#         roads_circuity_averages_list += [np.nan]\n",
    "#         roads_dead_ends_list += [np.nan]\n",
    "\n",
    "#     if ((bua_i+1) == len(gdf)): # final saving of values etc.\n",
    "#         pd.DataFrame(data={'osm_buildings_residential_count': buildings_res_count_list,\n",
    "#                            'osm_buildings_residential_density_per_ha': buildings_res_density_per_ha_list,\n",
    "#                            'osm_roads_street_density_per_km': roads_street_density_list,\n",
    "#                            'osm_roads_intersection_density_per_km': roads_intersection_density_list,\n",
    "#                            'osm_roads_circuity_average': roads_circuity_averages_list,\n",
    "#                            'osm_roads_proportion_dead_ends': roads_dead_ends_list}).to_csv(\"osm_extraction_saved_progress_final.csv\")\n",
    "\n",
    "# gdf.loc[:,'osm_buildings_residential_count'] = buildings_res_count_list\n",
    "# gdf.loc[:,'osm_buildings_residential_density_per_ha'] = buildings_res_density_per_ha_list\n",
    "# gdf.loc[:,'osm_roads_street_density_per_km'] = roads_street_density_list\n",
    "# gdf.loc[:,'osm_roads_intersection_density_per_km'] = roads_intersection_density_list\n",
    "# gdf.loc[:,'osm_roads_circuity_average'] = roads_circuity_averages_list\n",
    "# gdf.loc[:,'osm_roads_proportion_dead_ends'] = roads_dead_ends_list\n",
    "\n",
    "# gdf.to_csv(\"domesday_gdf_prepared_final.csv\", index=False)\n",
    "\n",
    "# gdf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d304e00",
   "metadata": {},
   "source": [
    "### 4. Querying OpenAI LLM using enriched query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed646e4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from shapely import wkt\n",
    "df = pd.read_csv(\"domesday_gdf_prepared_final.csv\")\n",
    "df['geometry'] = df['geometry'].apply(wkt.loads)\n",
    "bua_osm_gdf = gpd.GeoDataFrame(data=df, geometry='geometry', crs='EPSG:4326')\n",
    "bua_2022_gdf = bua_osm_gdf.merge(bua_2022_gdf[['BUA Code', 'lulc_proportions_dict_values']], how='left', on='BUA Code')\n",
    "bua_2022_gdf.info()\n",
    "bua_2022_gdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6438271",
   "metadata": {},
   "outputs": [],
   "source": [
    "bua_2022_gdf['word_limit'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ef59eaf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_summary_of_place_from_openai(place, subregion, region, word_limit, residents, land_use_dict_values, \n",
    "                                     long_form=False, pub_count=None, worship_sites_count=None, community_facilities_density_by_population=None,\n",
    "                                     buildings_density_value=None, dead_end_roads_value=None, completion_notifications=True):\n",
    "\n",
    "  if long_form: # \n",
    "    input_prompt_text = f\"Write a concise, non-effusive description of ideally under {word_limit} words describing some of \\\n",
    "        the landmarks, socio-economic characteristics, cultural influences and/or topographical environment of the place of {place} in {subregion}, {region}, UK. \\\n",
    "          Write as one continuous, short paragraph. Do not write generic statements. \\\n",
    "            The following background facts can be selectively incorporated, with land use statistics referenced only occasionally (and with generalised land use type namings) so as to not interrupt the flow: \\\n",
    "              {place} had an official resident population of {residents} in 2021 (ONS). \\\n",
    "                Additionally, land use type percentage values follow here as a Python dictionary of format land use type (key): percentage (value): {land_use_dict_values}. \\\n",
    "                  There are {buildings_density_value} residential buildings per ha, and {dead_end_roads_value} proportion of roads are dead-ends, according to OpenStreetMap data. \\\n",
    "                    There are a total count of {pub_count} pubs, {worship_sites_count} total worship sites, and a facility density of {community_facilities_per_100k} community facilities per 100,000 people, according to ONS statistics. \\\n",
    "                      Ideal {word_limit} word limit.\"\n",
    "  else:\n",
    "    input_prompt_text = f\"Write a concise, non-effusive description of ideally under {word_limit} words describing some of \\\n",
    "        the landmarks, socio-economic characteristics, cultural influences and/or topographical environment of the place of {place} in {subregion}, {region}, UK. \\\n",
    "          Write as one continuous, short paragraph. Do not write generic statements. \\\n",
    "            The following background facts can be selectively incorporated, with land use statistics referenced only occasionally (and with generalised land use type namings) so as to not interrupt the flow: \\\n",
    "              {place} had an official resident population of {residents} in 2021 (ONS). \\\n",
    "                Additionally, land use type percentage values follow here as a Python dictionary of format land use type (key): percentage (value): {land_use_dict_values}. \\\n",
    "                  Ideal {word_limit} word limit.\"\n",
    "\n",
    "  \n",
    "  response = client.responses.create(\n",
    "    model=\"gpt-4o-mini\",\n",
    "    input=[\n",
    "        {\"role\": \"system\", \"content\": \"You are a writer of short place descriptions of preferably no more than 160 words under any circumstances.\"},\n",
    "        {\"role\": \"user\", \"content\": input_prompt_text}\n",
    "    ], # previously input=input_promnpt_text\n",
    "    store=True,\n",
    "    service_tier=\"priority\"\n",
    "  )\n",
    "\n",
    "  ai_output_text = response.output_text\n",
    "  ai_output_text = ai_output_text.replace(\"\\n\\n\", \" \")\n",
    "\n",
    "  if completion_notifications:\n",
    "    print(f\"Finished processing for {place} in {subregion} in {region} (Words: \" + str(len(ai_output_text.split(\" \"))) + \")\")\n",
    "\n",
    "  return ai_output_text\n",
    "\n",
    "\n",
    "place_summary_responses_list = []\n",
    "for place_i in range(0,len(bua_2022_gdf)):\n",
    "    print(place_i) # Progress visualisation\n",
    "    place = bua_2022_gdf.iloc[place_i]['BUA Name']\n",
    "    subregion = bua_2022_gdf.iloc[place_i]['County Name']\n",
    "    region = bua_2022_gdf.iloc[place_i]['Country']\n",
    "    residents = bua_2022_gdf.iloc[place_i]['2021 Resident Population']\n",
    "    land_use_dict = bua_2022_gdf.iloc[place_i]['lulc_proportions_dict_values']\n",
    "    word_limit = round((bua_2022_gdf.iloc[place_i]['word_limit']), 0)\n",
    "    if bua_2022_gdf.iloc[place_i]['BUA Size Classification'] in [\"Medium\", \"Large\", \"Major\"]: # if has extra data as BUA Size Medium or larger...\n",
    "      pubs = bua_2022_gdf.iloc[place_i]['Pub Count']\n",
    "      worship_sites = bua_2022_gdf.iloc[place_i]['Religious Worship Sites Count']\n",
    "      community_facilities_per_100k = bua_2022_gdf.iloc[place_i]['Community Facilities per 100,000 people']\n",
    "      housing_density_ha = bua_2022_gdf.iloc[place_i]['osm_buildings_residential_density_per_ha']\n",
    "      roads_dead_end = bua_2022_gdf.iloc[place_i]['osm_roads_proportion_dead_ends']\n",
    "      place_summary_responses_list += [get_summary_of_place_from_openai(place, subregion, region, residents, land_use_dict, word_limit, True, pubs, worship_sites, community_facilities_per_100k, housing_density_ha, roads_dead_end)]\n",
    "    else:\n",
    "       place_summary_responses_list += [get_summary_of_place_from_openai(place, subregion, region, residents, land_use_dict, word_limit, False, None, None, None, None, None)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff352f4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "bua_2022_gdf = bua_2022_gdf.iloc[0:len(place_summary_responses_list),:] # REVISIT\n",
    "\n",
    "bua_2022_gdf['place_summary_openai'] = place_summary_responses_list\n",
    "bua_2022_gdf_explore = bua_2022_gdf[['BUA Name', 'County Name', 'place_summary_openai', 'geometry']]\n",
    "bua_2022_gdf_explore.columns = [\"PLACE\", \"COUNTY\", \"OPENAI PLACE SUMMARY\", \"geometry\"]\n",
    "bua_2022_gdf_explore.to_csv(\"2512_british_towns_new_test_gdf_as_df.csv\", index=False)\n",
    "# bua_2022_gdf_explore = data = pd.read_csv(\"2512_british_towns_new_test_gdf_as_df.csv\")\n",
    "# Top 10 words in generated place descriptions: pd.Series([word for sublist in [[y for y in bua_2022_gdf_explore['OPENAI PLACE SUMMARY'].str.replace(\".\",\"\").str.replace(\",\",\"\").str.split(\" \")[i]] for i in range(0,len(bua_2022_gdf_explore))] for word in sublist]).value_counts().head(10)\n",
    "bua_2022_gdf_explore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8cd70b34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# INTEGRATE IN LLM DISTANCE ERROR RANK PERCENTILE DERIVED 'RELIABILITY SCORE' FROM PART 2 CODE, TO INTEGRATE INTO FINAL TEXT\n",
    "reliability_scores = pd.read_csv(\"2512_distance_scores_per_bua_final.csv\")\n",
    "bua_2022_gdf_explore_reliability = bua_2022_gdf_explore.copy().merge(reliability_scores, how='left', on='PLACE')\n",
    "bua_2022_gdf_explore_reliability['reliability_score_inverse_rank_percentile_2dp'] = round(((1 - bua_2022_gdf_explore_reliability['distance'].rank(pct=True)) * 100.00), 2)\n",
    "bua_2022_gdf_explore_reliability"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d09ba09d",
   "metadata": {},
   "source": [
    "### 5. Generating Interactive Output Map in Folium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56fc6dd4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# GENERATE INTERACTIVE OUTPUT MAP\n",
    "\n",
    "map = folium.Map([53.00, -2.00], tiles=None, zoom_start=8)\n",
    "\n",
    "tiles_osm_fg = folium.FeatureGroup(name=\"Basemap: OSM Standard\", show=True, control=True)\n",
    "folium.TileLayer(tiles=\"OpenStreetMap\", show=True, control=True).add_to(tiles_osm_fg)\n",
    "map.add_child(tiles_osm_fg)\n",
    "\n",
    "places_fg = folium.FeatureGroup(name=\"SETTLEMENTS OF BRITAIN\", show=True, control=False)\n",
    "\n",
    "search_fg = folium.FeatureGroup(name=\"Search\", show=False, control=False)\n",
    "search_layer = folium.GeoJson(bua_2022_gdf_explore, style_function=lambda x: {'opacity': 0.00, 'fillOpacity': 0.00}).add_to(search_fg)\n",
    "map.add_child(search_fg)\n",
    "folium.plugins.Search(layer=search_fg, geom_type='Polygon', search_label='PLACE', collapsed=True).add_to(map)\n",
    "\n",
    "for _, row_i in bua_2022_gdf_explore.iterrows():\n",
    "    json = gpd.GeoSeries(row_i['geometry']).to_json()\n",
    "    geo_json = folium.GeoJson(json, \n",
    "                              style_function=lambda x: {'fillColor': 'black', 'fillOpacity': 0.56, 'color': 'black', 'weight': 0.6, 'dashArray': \"2, 2\"},\n",
    "                              tooltip=folium.Tooltip(folium.Html(f\"\"\"<h1><strong>{row_i['PLACE'].upper()}</strong></h1>\n",
    "                                                                 <div style=\"width:350px; white-space:normal; word-wrap:break-word;\">\n",
    "                                                                 <i>{row_i['OPENAI PLACE SUMMARY']}</i>\n",
    "                                                                 </div>\n",
    "                                                                 \"\"\", script=True).render()),\n",
    "                              highlight_function=lambda x: {\"fillColor\": 'green'})\n",
    "    geo_json.add_to(places_fg)\n",
    "map.add_child(places_fg)\n",
    "\n",
    "fp.Fullscreen(title=\"Enter Full Screen\", force_separate_button=True).add_to(map)\n",
    "fp.MiniMap(tile_layer=\"OpenStreetMap\", toggle_display=True, zoom_animation=True, position=\"bottomleft\").add_to(map)\n",
    "folium.LayerControl(collapsed=True, draggable=True, position=\"bottomright\").add_to(map)\n",
    "\n",
    "map.save(\"2512_britain_places_output_map.html\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ba2def13",
   "metadata": {},
   "source": [
    "### 6. Producing 'Domesday Book 2025' document in Word/PDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05dff6ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# WORD TEXT PROCESSING TEXT CONSTRUCTION/FORMATTING CODE\n",
    "import docx\n",
    "import datetime\n",
    "time_now = str(datetime.datetime.now()).split(\".\")[0]\n",
    "\n",
    "bua_2022_gdf_explore_reliability = bua_2022_gdf_explore_reliability.sort_values('PLACE').reset_index(drop=True)\n",
    "doc = docx.Document()\n",
    "\n",
    "def set_font_family(doc, font_name=\"Arial\", font_size=11):\n",
    "    for style in doc.styles:\n",
    "        if style.type in (docx.enum.style.WD_STYLE_TYPE.PARAGRAPH, docx.enum.style.WD_STYLE_TYPE.CHARACTER):\n",
    "            style.font.name = font_name\n",
    "            style._element.rPr.rFonts.set(docx.oxml.ns.qn(\"w:eastAsia\"), font_name)\n",
    "    return doc\n",
    "set_font_family(doc, \"Times New Roman\")\n",
    "\n",
    "doc.add_heading(f\"The Domesday Book: Project 2025\\n{time_now}\")\n",
    "doc.add_page_break()\n",
    "\n",
    "cols = docx.oxml.OxmlElement('w:cols')\n",
    "cols.set(docx.oxml.ns.qn('w:num'),'2')\n",
    "cols.set(docx.oxml.ns.qn('w:space'),str(int(docx.shared.Inches(0.25).pt)))\n",
    "doc.sections[0]._sectPr.append(cols)\n",
    "for section in doc.sections:\n",
    "    section.top_margin = section.bottom_margin = section.left_margin = section.right_margin = docx.shared.Cm(1.27)\n",
    "\n",
    "counties_list = sorted(bua_2022_gdf_explore_reliability['COUNTY'].unique().tolist())\n",
    "for county_i in counties_list:\n",
    "    if county_i == counties_list[0]:\n",
    "        doc.add_page_break()\n",
    "    doc.add_heading(county_i, level=3)\n",
    "    bua_2022_gdf_explore_i = bua_2022_gdf_explore_reliability[bua_2022_gdf_explore_reliability['COUNTY']==county_i].reset_index(drop=True)\n",
    "    for place_i in range(0, len(bua_2022_gdf_explore_i)):\n",
    "        doc.add_heading(str(bua_2022_gdf_explore_i.loc[place_i,'PLACE']) + \" [\" + str(bua_2022_gdf_explore_i.loc[place_i,'reliability_score_inverse_rank_percentile_2dp']) + \"]\", level=4)\n",
    "        run = doc.add_paragraph().add_run(bua_2022_gdf_explore_i.loc[place_i,'OPENAI PLACE SUMMARY'])\n",
    "        run.italic = True\n",
    "        run.font.size = docx.shared.Pt(8)\n",
    "\n",
    "doc.save(\"2512_domesday_document.docx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "988e6c15",
   "metadata": {},
   "outputs": [],
   "source": [
    "# WORD TO PDF EXPORT CODE\n",
    "import win32com.client as win32\n",
    "# word = win32.gencache.EnsureDispatch(\"Word.Application\")\n",
    "word = win32.Dispatch(\"Word.Application\")\n",
    "word.Visible = True\n",
    "doc = word.Documents.Open(r\"C:\\Users\\georg\\Documents\\work\\2508_domesday_python_ai\\2512_domesday_document.docx\")\n",
    "doc.SaveAs(r\"C:\\Users\\georg\\Documents\\work\\2508_domesday_python_ai\\2512_domesday_document.pdf\", FileFormat=17)\n",
    "doc.Close()\n",
    "word.Quit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbea58dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Word Limit Monitor Testing\n",
    "bua_2022_gdf_test = bua_2022_gdf.copy()[['word_limit','place_summary_openai']]\n",
    "bua_2022_gdf_test['place_summary_openai'] = bua_2022_gdf_test['place_summary_openai'].str.split(\" \").str.len()\n",
    "bua_2022_gdf_test['words_over_limit'] = bua_2022_gdf_test['place_summary_openai'] - bua_2022_gdf_test['word_limit']\n",
    "bua_2022_gdf_test['words_over_limit'].hist(bins=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48af8d22",
   "metadata": {},
   "outputs": [],
   "source": [
    "bua_2022_gdf_test['place_summary_openai'].describe().to_csv(\"2512_place_description_length_summary_stats_final.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47a2493b",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.rcParams['font.size'] = 16\n",
    "g = sns.displot(data=bua_2022_gdf_test, x='place_summary_openai', color='Black', edgecolor='White', bins=120, height=5, aspect=1.8)\n",
    "plt.title(\"Distribution of Written Place Summary Length(s)\\n\")\n",
    "plt.xlabel(\"\\nLength (Words)\")\n",
    "plt.ylabel(\"Count (Number of Places)\\n\")\n",
    "for ax in g.axes.flatten():\n",
    "    ax.axvline(x=160, color='red', linestyle='--', linewidth=1.4)\n",
    "    ax.text(\n",
    "        x=0.20, y=0.80,                # position in axis coordinates\n",
    "        s=\"160 Word 'Limit'\",\n",
    "        rotation=90,                 # vertical orientation\n",
    "        va='center', ha='center',\n",
    "        transform=ax.transAxes,      # use axis-relative coordinates\n",
    "        fontsize=12,\n",
    "        color=\"Red\",\n",
    "        bbox=dict(\n",
    "            boxstyle=\"round,pad=0.3\",\n",
    "            facecolor=\"White\",\n",
    "            edgecolor=\"White\",\n",
    "            alpha=0.8)\n",
    "            )\n",
    "plt.savefig(\"2512_written_length_histogram_figure.png\", dpi=600, bbox_inches='tight')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
